Personal income decreased $216.2 billion, or 1.0 percent at a monthly rate, while consumer spending increased $93.4 billion, or 0.6 percent, in September. The decrease in personal income primarily reflected the winding down of pandemic-related assistance programs.
TO DOs: * linear model: consumption ~ time, COVID-19
We are insterested in exploring the how the changes in the following variables in personal income might affect PCE:
v.s. * Personal consumption expenditures (on goods and services)
We want to look at data from 1967Q1 to 2020Q4
format_function <- function(input) {
str_c(str_split(input, 'q')[[1]][1], ' Quarter ', str_split(input, 'q')[[1]][2])
}
pce_4720 <- read_excel("data/pce_1947_2020.xlsx", sheet = 'T20100-Q',
range = 'A8:KM54') %>%
janitor::clean_names() %>%
select(x2, x1967q1:x2020q4) %>%
drop_na() %>%
# We only want to look at several variables:
filter(
x2 == 'Wages and salaries' |
x2 == 'Supplements to wages and salaries' |
x2 == 'Social security' |
x2 == 'Medicare' |
x2 == 'Less: Personal current taxes' |
x2 == 'Personal consumption expenditures'
) %>%
pivot_longer(
x1967q1:x2020q4,
names_to = 'time',
names_prefix = 'x',
values_to = 'millions_of_dollars'
) %>%
rename(variable = x2) %>%
mutate(
variable = ifelse(variable == 'Less: Personal current taxes', 'Personal current taxes', variable)
)
#separate(time, c("year", "quarter"), "q") %>%
pce_4720_formatted <- pce_4720 %>%
mutate(
time = map(.x = time, format_function)
)
Let’s take a brief look at the changes of variables over time
fig_1 <- pce_4720_formatted %>%
plot_ly(x = ~time, y = ~millions_of_dollars, type = "scatter", mode = "lines", color = ~variable) %>%
layout(title = "<b> Spaghetti Plot for Personal Income and Dispositions v.s. PCE <b>", yaixs = list(title = "<i> Millions of Dollars <i>"), barmode = "stack", legend = list(title = list(text = '<b> Dispositions </b>'))) %>%
layout(legend = list(orientation = 'h', x = 0, y = -0.2))
fig_1
A log graph will be more informative
fig_2 <- pce_4720_formatted %>%
plot_ly(x = ~time, y = ~log(millions_of_dollars), type = "scatter", mode = "lines", color = ~variable) %>%
layout(title = "<b> Spaghetti Plot for log(Personal Income and Dispositions) v.s. PCE </b>", yaixs = list(title = "log(Millions of Dollars)"), barmode = "stack", legend = list(title = list(text = '<i> Dispositions </i>'))) %>%
layout(legend = list(orientation = 'h', x = 0, y = -0.2))
fig_2
Now let’s look at data pre- v.s. post-COVID-19 Pandemic
pce_4718_by_dis <-
pce_4720 %>%
filter(!str_detect(time, '2019|2020')) %>%
pivot_wider(
names_from = variable,
values_from = millions_of_dollars
) %>%
janitor::clean_names()
smooth_mod = mgcv::gam(
log(personal_consumption_expenditures) ~ s(log(wages_and_salaries)) +
s(log(social_security)) + s(log(supplements_to_wages_and_salaries)) + s(log(personal_current_taxes)) + s(log(medicare)),
data = pce_4718_by_dis)
summary(smooth_mod)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(personal_consumption_expenditures) ~ s(log(wages_and_salaries)) +
## s(log(social_security)) + s(log(supplements_to_wages_and_salaries)) +
## s(log(personal_current_taxes)) + s(log(medicare))
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.510e+01 3.668e-04 41171 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(log(wages_and_salaries)) 1.000 1.000 104.907 < 2e-16 ***
## s(log(social_security)) 3.201 4.180 4.120 0.00507 **
## s(log(supplements_to_wages_and_salaries)) 9.000 9.000 21.982 < 2e-16 ***
## s(log(personal_current_taxes)) 8.153 8.785 6.871 < 2e-16 ***
## s(log(medicare)) 8.815 8.973 8.551 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 1 Deviance explained = 100%
## GCV = 3.2911e-05 Scale est. = 2.7979e-05 n = 208
smooth_mod %>% broom::tidy()
## # A tibble: 5 Ă— 5
## term edf ref.df statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 s(log(wages_and_salaries)) 1.00 1.00 105. 0
## 2 s(log(social_security)) 3.20 4.18 4.12 0.00507
## 3 s(log(supplements_to_wages_and_salaries)) 9.00 9.00 22.0 0
## 4 s(log(personal_current_taxes)) 8.15 8.79 6.87 0
## 5 s(log(medicare)) 8.82 8.97 8.55 0
smooth_mod.fit <- mgcViz::getViz(smooth_mod)
qqplot <- mgcViz::qq(smooth_mod.fit, method = "simul1", a.qqpoi = list("shape" = 1), a.ablin = list("linetype" = 2))
qqplot
Now let’s predict the PCE using our model
pce_2019 <-
pce_4720 %>%
filter(
str_detect(time, '2019')
) %>%
pivot_wider(
names_from = variable,
values_from = millions_of_dollars
) %>%
janitor::clean_names()
pce_2019_pred <- predict.gam(smooth_mod,pce_2019)
pce_2019_pred
## 1 2 3 4
## 16.48962 16.49738 16.50608 16.51574
log(pce_2019$personal_consumption_expenditures)
## [1] 16.47413 16.48947 16.49963 16.50738
#RMSQ
rmse(smooth_mod, pce_2019)
## [1] 0.01017228
pce_2020 <-
pce_4720 %>%
filter(
str_detect(time, '2020')
) %>%
pivot_wider(
names_from = variable,
values_from = millions_of_dollars
) %>%
janitor::clean_names()
pce_2020_pred <- predict.gam(smooth_mod,pce_2020)
pce_2020_pred
## 1 2 3 4
## 16.52307 16.50159 16.53446 16.55342
log(pce_2020$personal_consumption_expenditures)
## [1] 16.49279 16.38792 16.48285 16.49221
#RMSQ
rmse(smooth_mod, pce_2020)
## [1] 0.07114881
Hypothesize: there is significant difference between our model prediction for pre and post pandemic periods
Since there is not enough dataset for post pandemic, we need to bootstrap a set of new samples from year 2020 and 2021
# Bootstrapping? Testing on 1967-2016
# include dataset from 2021q1, 2021q2, 2021q3
pce_4720_raw <- read_excel("data/pce_1947_2020.xlsx", sheet = 'T20100-Q',
range = 'A8:KM54') %>%
janitor::clean_names() %>%
select(x2, x1967q1:x2020q4) %>%
drop_na() %>%
# We only want to look at several variables:
filter(
x2 == 'Wages and salaries' |
x2 == 'Supplements to wages and salaries' |
x2 == 'Social security' |
x2 == 'Medicare' |
x2 == 'Less: Personal current taxes' |
x2 == 'Personal consumption expenditures'
) %>%
rename(variable = x2)
rownames(pce_4720_raw) <- pull(pce_4720_raw, variable)
pce_4720_raw <- mutate(pce_4720_raw, variable = NULL)
pce_4720_wide <- as.data.frame(t(as.matrix(pce_4720_raw))) %>%
janitor::clean_names() %>%
rename(personal_current_taxes = less_personal_current_taxes)
Select samples data set
pce_2019_df <- subset(pce_4720_wide, str_detect(rownames(pce_4720_wide), '2019'))
pce_2020_df <- subset(pce_4720_wide, str_detect(rownames(pce_4720_wide), '2020'))
bootstrapping
boot_straps_19 =
pce_2019_df %>%
modelr::bootstrap(n = 1000)
boot_straps_20 =
pce_2020_df %>%
modelr::bootstrap(n = 1000)
Compute rmse
rmse_compute <- function(data){
data <- as.data.frame(data)
return(rmse(smooth_mod, data))
}
rmse_19 <- boot_straps_19 %>%
mutate(
rmse = map_dbl(strap, rmse_compute)
) %>%
select(rmse)
rmse_20 <- boot_straps_20 %>%
mutate(
rmse = map_dbl(strap, rmse_compute)
) %>%
select(rmse)
Perform test
t.test(rmse_19,rmse_20)
##
## Welch Two Sample t-test
##
## data: rmse_19 and rmse_20
## t = -108.23, df = 1026.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.05852254 -0.05643821
## sample estimates:
## mean of x mean of y
## 0.009950766 0.067431140
We reject the null hypothesis and conclude that there is significant difference between our prediction of personal consumption expenditures for year 2019 and year 2020.